## Load and prepare the training set.
## NOTE(review): the original used read.csv2(..., sep = ","), which keeps
## dec = "," conflicting with the field separator; read.csv (sep = ",",
## dec = ".") states the intended format directly. The result is identical
## here since the numeric columns appear to be integers -- confirm against
## the raw file.
data_train <- read.csv("spreadsheets/data_train.csv")
data_train <- na.omit(data_train)    # drop rows with missing values
data_train <- fact.data(data_train)  # project helper (defined elsewhere); presumably converts columns to factors
dim(data_train)
## [1] 784 32
head(data_train)
## Load and prepare the test set (same pipeline; no Attrition column, hence 31 variables).
data_test <- read.csv("spreadsheets/data_test.csv")
data_test <- na.omit(data_test)
data_test <- fact.data(data_test)
dim(data_test)
## [1] 332 31
head(data_test)
## Keep only the numeric columns of each set.
## vapply() guarantees a logical vector of the right length (safer than
## unlist(lapply(...))), and appending Attrition by name avoids relying on
## position-16 assignment propagating the column name from the RHS.
data_train_num <- data_train[, vapply(data_train, is.numeric, logical(1))]
data_train_num["Attrition"] <- data_train["Attrition"]  # response appended as the 16th column
dim(data_train_num)
## [1] 784 16
head(data_train_num)
data_test_num <- data_test[, vapply(data_test, is.numeric, logical(1))]
dim(data_test_num)
## [1] 332 15
head(data_test_num)
library(FactoMineR)
## Log-transform the numeric predictors, neutralise log(0) = -Inf, then
## standardise each ROW via t(scale(t(.))), and run a PCA with the response
## (Attrition, column 16) as a supplementary qualitative variable.
log_feats <- log(data_train_num[-16])
log_feats <- replace(log_feats, log_feats == -Inf, 0)
data_train_log <- as.data.frame(t(scale(t(log_feats))))
data_train_log[16] <- data_train["Attrition"]
res_pca_train <- PCA(data_train_log, scale.unit = TRUE, graph = FALSE, quali.sup = 16)
coord_data_train <- res_pca_train$ind$coord[, 1:2]
## Individuals on the first factorial plane, coloured by attrition status.
plot(coord_data_train[, 1], coord_data_train[, 2],
     col = data_train$Attrition, xlab = "Axe 1", ylab = "Axe 2")
legend("topright", legend = levels(data_train$Attrition),
       col = 1:2, cex = 0.8, pch = 1)
## Same log / -Inf cleanup / row-standardisation pipeline for the test set
## (no Attrition column available here).
test_log_feats <- log(data_test_num)
test_log_feats <- replace(test_log_feats, test_log_feats == -Inf, 0)
data_test_log <- as.data.frame(t(scale(t(test_log_feats))))
res_pca_test <- PCA(data_test_log, scale.unit = TRUE, graph = FALSE)
coord_data_test <- res_pca_test$ind$coord[, 1:2]
plot(coord_data_test[, 1], coord_data_test[, 2], xlab = "Axe 1", ylab = "Axe 2")
library(klaR)
## Partition plots of the LDA / QDA decision regions on the first two
## principal components, grouped by attrition status.
partimat(coord_data_train, grouping = data_train_num$Attrition, method = "lda")
partimat(coord_data_train, grouping = data_train_num$Attrition, method = "qda")
## K-means on the raw numeric predictors, 2 clusters for the binary target;
## nstart = 1000 restarts to stabilise the stochastic initialisation.
res.kmeans <- kmeans(data_train_num[-16], centers = 2, nstart = 1000)
## Clusters on the factorial plane; point shape encodes the true class.
plot(coord_data_train, col = res.kmeans$cluster, pch = as.numeric(data_train$Attrition))
## Cross-tabulate clusters against the true Attrition labels.
plot(table(res.kmeans$cluster, data_train$Attrition))
## Model: hierarchical clustering (Ward criterion on Euclidean distances).
## Only the numeric predictors enter the distance: column 16 (Attrition, a
## factor) is excluded, consistently with the k-means call above -- dist()
## on a data frame containing a factor does not yield meaningful distances.
cah.ward <- hclust(dist(data_train_num[-16]), method = "ward.D2")
## Select 2 clusters (binary choice)
plot(cah.ward, hang = -1)
rect.hclust(cah.ward, 2)
res.cah <- cutree(cah.ward, 2)
## Clusters on the factorial plane; point shape encodes the true class.
plot(coord_data_train, col = res.cah, pch = as.numeric(data_train$Attrition))
## Cross-tabulate clusters against the true labels.
plot(table(res.cah, data_train$Attrition))
## QDA on the raw (unbalanced) numeric training predictors.
## `<-` used for assignment per R style (was `=`); the call is unchanged.
res.qda <- qda(data_train_num[-16], grouping = data_train_num$Attrition)
res.qda
## Call:
## qda(data_train_num[-16], grouping = data_train_num$Attrition)
##
## Prior probabilities of groups:
## No Yes
## 0.8354592 0.1645408
##
## Group means:
## Age DailyRate DistanceFromHome EmployeeNumber HourlyRate MonthlyIncome
## No 38.77099 792.5939 9.503817 1023.669 66.86260 7162.046
## Yes 34.42636 756.1938 10.449612 1039.922 67.96899 4947.279
## MonthlyRate NumCompaniesWorked PercentSalaryHike TotalWorkingYears
## No 14124.12 2.708397 15.32672 12.670229
## Yes 14534.25 3.038760 15.16279 8.387597
## TrainingTimesLastYear YearsAtCompany YearsInCurrentRole
## No 2.781679 7.767939 4.687023
## Yes 2.604651 5.240310 2.798450
## YearsSinceLastPromotion YearsWithCurrManager
## No 2.343511 4.465649
## Yes 1.837209 2.821705
## Resubstitution predictions; rows = true class, columns = predicted class.
pred.qda <- predict(res.qda, data_train_num[-16])$class
table(data_train_num$Attrition, pred.qda)
## pred.qda
## No Yes
## No 553 102
## Yes 59 70
Sur les « Yes » prédits, on a plus d'erreurs que de cas justes, alors que ce n'est pas le cas avec les prédictions sur « No ».
library(DMwR)  # NOTE(review): DMwR was archived on CRAN; may need installation from the archive
## Class distribution before balancing: Attrition is heavily skewed toward "No".
table(data_train_num$Attrition)
##
## No Yes
## 655 129
## Rebalance the classes with SMOTE (synthetic minority oversampling, default parameters).
data_train_bal <- SMOTE(Attrition ~ ., data_train_num)
table(data_train_bal$Attrition)
##
## No Yes
## 516 387
library(MASS)
## Models: linear and quadratic discriminant analysis on the balanced set.
res.lda <- lda(data_train_bal[-16], grouping = data_train_bal$Attrition)
res.qda <- qda(data_train_bal[-16], grouping = data_train_bal$Attrition)
## Resubstitution predictions (on the balanced training data itself).
pred.lda <- predict(res.lda, newdata = data_train_bal[-16])
pred.qda <- predict(res.qda, newdata = data_train_bal[-16])
## Confusion matrices; accuracy = trace over total (same as summing the
## two diagonal cells of a 2x2 table).
conf.lda <- table(pred.lda$class, data_train_bal$Attrition)
accuracy.lda <- sum(diag(conf.lda)) / sum(conf.lda)
plot(conf.lda)
conf.qda <- table(pred.qda$class, data_train_bal$Attrition)
accuracy.qda <- sum(diag(conf.qda)) / sum(conf.qda)
plot(conf.qda)
## ROC curves built from the posterior probability of the second class
## level ("Yes", per the level order shown above).
library(pROC)
ROC.lda <- roc(data_train_bal$Attrition, pred.lda$posterior[,2])
ROC.qda <- roc(data_train_bal$Attrition, pred.qda$posterior[,2])
plot(ROC.lda, print.auc=TRUE, print.auc.y = 0.5, col = 1)
plot(ROC.qda, add = TRUE, print.auc=TRUE, print.auc.y = 0.45, col = 2)
legend("bottomright", lwd = 1, col = 1:2, c("LDA", "QDA"))
library(klaR)
## Model: backward stepwise variable selection for LDA, driven by the
## cross-validated correctness rate (klaR::stepclass).
stepwise.lda = stepclass(data_train_bal[-16], grouping = data_train_bal$Attrition, method = "lda", direction = "backward")
## correctness rate: 0.66889; starting variables (15): Age, DailyRate, DistanceFromHome, EmployeeNumber, HourlyRate, MonthlyIncome, MonthlyRate, NumCompaniesWorked, PercentSalaryHike, TotalWorkingYears, TrainingTimesLastYear, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager
## correctness rate: 0.68332; out: "DailyRate"; variables (14): Age, DistanceFromHome, EmployeeNumber, HourlyRate, MonthlyIncome, MonthlyRate, NumCompaniesWorked, PercentSalaryHike, TotalWorkingYears, TrainingTimesLastYear, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager
## correctness rate: 0.68885; out: "HourlyRate"; variables (13): Age, DistanceFromHome, EmployeeNumber, MonthlyIncome, MonthlyRate, NumCompaniesWorked, PercentSalaryHike, TotalWorkingYears, TrainingTimesLastYear, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager
## correctness rate: 0.68996; out: "MonthlyIncome"; variables (12): Age, DistanceFromHome, EmployeeNumber, MonthlyRate, NumCompaniesWorked, PercentSalaryHike, TotalWorkingYears, TrainingTimesLastYear, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager
##
## hr.elapsed min.elapsed sec.elapsed
## 0.000 0.000 3.369
## Inspect the retained model; the selection removed DailyRate, HourlyRate
## and MonthlyIncome from the 15 starting variables.
stepwise.lda
## method : lda
## final model : data_train_bal$Attrition ~ Age + DistanceFromHome + EmployeeNumber +
## MonthlyRate + NumCompaniesWorked + PercentSalaryHike + TotalWorkingYears +
## TrainingTimesLastYear + YearsAtCompany + YearsInCurrentRole +
## YearsSinceLastPromotion + YearsWithCurrManager
## <environment: 0x7fa2c0818438>
##
## correctness rate = 0.69
## Fit the LDA restricted to the variables retained by stepclass().
## The stored formula references data_train_bal$Attrition through its own
## environment, so the response need not be present in `data`.
## `<-` used for assignment per R style (was `=`).
res.stepwise.lda <- lda(stepwise.lda$formula, data = data_train_bal[-16])
## Prediction (resubstitution)
pred.stepwise.lda <- predict(res.stepwise.lda, newdata = data_train_bal[-16])
## Confusion matrix; accuracy = trace over total
conf.stepwise.lda <- table(pred.stepwise.lda$class, data_train_bal$Attrition)
accuracy.stepwise.lda <- sum(diag(conf.stepwise.lda)) / sum(conf.stepwise.lda)
plot(conf.stepwise.lda)
## ROC curve
ROC.stepwise.lda <- roc(data_train_bal$Attrition, pred.stepwise.lda$posterior[,2])
plot(ROC.stepwise.lda, print.auc=TRUE, print.auc.y = 0.5)
legend("bottomright", lwd = 1, col = 1, "LDA stepwise")
library(rpart)
library(rpart.plot)
## Model: fully grown CART tree (cp = 0, minsplit = 5), then pruned at the
## complexity parameter minimising the cross-validated error.
arbre.cart <- rpart(data_train_bal$Attrition ~ ., data = data_train_bal[-16],
                    control = rpart.control(minsplit = 5, cp = 0))
plotcp(arbre.cart)
## Tree optimisation: prune at the cp of minimal cross-validated error.
xerr <- arbre.cart$cptable[, "xerror"]
cp.opt <- arbre.cart$cptable[which.min(xerr), "CP"]
arbre.opt <- prune(arbre.cart, cp = cp.opt)
rpart.plot(arbre.opt, type=4, digits=2)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
## Prediction (resubstitution): hard classes and P(class 2) for the ROC.
pred.cart.class <- predict(arbre.opt, newdata = data_train_bal[-16], type = "class")
pred.cart.prob <- predict(arbre.opt, newdata = data_train_bal[-16], type = "prob")[,2]
## Confusion matrix; accuracy = trace over total.
conf.cart <- table(pred.cart.class, data_train_bal$Attrition)
accuracy.cart <- sum(diag(conf.cart)) / sum(conf.cart)
plot(conf.cart)
## ROC curve
ROC.cart <- roc(data_train_bal$Attrition, pred.cart.prob)
plot(ROC.cart, print.auc=TRUE, print.auc.y = 0.5, col = 1)
legend("bottomright", lwd = 1, col = 1, "CART")
library(randomForest)
## Model: random forest (500 trees by default) on the balanced training set.
res.RF <- randomForest(data_train_bal$Attrition ~ ., data_train_bal[-16])
res.RF
##
## Call:
## randomForest(formula = data_train_bal$Attrition ~ ., data = data_train_bal[-16])
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 10.63%
## Confusion matrix:
## No Yes class.error
## No 487 29 0.05620155
## Yes 67 320 0.17312661
## Prediction
## NOTE(review): predictions are computed on the training data itself, so
## the accuracy/AUC reported below are optimistic (a forest nearly
## memorises its own training set -- hence the accuracy of 1 later on);
## the OOB error above is the fairer estimate.
pred.RF.class <- predict(res.RF, newdata = data_train_bal[-16], type="class")
pred.RF.prob <- predict(res.RF, newdata = data_train_bal[-16], type = "prob")[,2]
## Confusion matrix
conf.RF <- table(pred.RF.class, data_train_bal$Attrition)
accuracy.RF <- (conf.RF[1,1] + conf.RF[2,2]) / sum(conf.RF)
plot(conf.RF)
## ROC curve
ROC.RF <- roc(data_train_bal$Attrition, pred.RF.prob)
plot(ROC.RF, print.auc=TRUE, print.auc.y = 0.5, col = 1)
legend("bottomright", lwd = 1, col = 1, "Random Forest")
library(glmnet)
## Model: lasso-penalised logistic regression; lambda chosen by
## cross-validation on the misclassification rate. The predictor matrix is
## built once and reused for fitting and prediction.
x_bal <- as.matrix(data_train_bal[-16])
res.Lasso <- glmnet(x_bal, data_train_bal$Attrition, family='binomial')
cv.Lasso <- cv.glmnet(x_bal, data_train_bal$Attrition, family="binomial", type.measure = "class")
plot(cv.Lasso)
## Prediction (resubstitution, at lambda.min)
pred.lasso.class <- predict(cv.Lasso, newx = x_bal, s = 'lambda.min', type = "class")
pred.lasso.prob <- predict(cv.Lasso, newx = x_bal, s = 'lambda.min', type = "response")[,1]
## Confusion matrix; accuracy = trace over total.
conf.lasso <- table(pred.lasso.class, data_train_bal$Attrition)
accuracy.lasso <- sum(diag(conf.lasso)) / sum(conf.lasso)
plot(conf.lasso)
## ROC curve
ROC.lasso <- roc(data_train_bal$Attrition, pred.lasso.prob)
plot(ROC.lasso, print.auc=TRUE, print.auc.y = 0.5, col = 1)
legend("bottomright", lwd = 1, col = 1, "Regression Logistique Lasso")
## Summary table: accuracy and AUC of every model, built in one call with
## dimnames instead of filling an empty matrix row by row.
result <- matrix(
  c(accuracy.lda, accuracy.qda, accuracy.stepwise.lda,
    accuracy.cart, accuracy.RF, accuracy.lasso,
    ROC.lda$auc, ROC.qda$auc, ROC.stepwise.lda$auc,
    ROC.cart$auc, ROC.RF$auc, ROC.lasso$auc),
  nrow = 2, byrow = TRUE,
  dimnames = list(c('accuracy', 'AUC'),
                  c('LDA', 'QDA', 'LDA stepwise', 'CART',
                    'Random Forest', 'Reg. Logi. Lasso'))
)
result
## LDA QDA LDA stepwise CART Random Forest
## accuracy 0.6821705 0.7187154 0.6998893 0.9700997 1
## AUC 0.7496044 0.8135629 0.7447068 0.9906982 1
## Reg. Logi. Lasso
## accuracy 0.6854928
## AUC 0.7508964
## Best model per criterion (column index).
apply(result, 1, which.max )
## accuracy AUC
## 5 5
## Overlay of every ROC curve on a single plot.
plot(ROC.lda, xlim = c(1,0))
plot(ROC.qda, add = TRUE, col = 2)
plot(ROC.stepwise.lda, add = TRUE, col = 3)
plot(ROC.cart, add = TRUE, col = 4)
plot(ROC.RF, add = TRUE, col = 5)
plot(ROC.lasso, add = TRUE, col = 6)
legend('bottomright', col = 1:6, paste(colnames(result)), lwd = 1)
La meilleure méthode de prédiction, sur les deux critères (accuracy et AUC), est le Random Forest.
## Predict attrition on the test set with the selected model (random
## forest) and visualise the predictions on the test factorial plane.
pred.Attrition <- predict(res.RF, newdata = data_test_num, type="class")
plot(coord_data_test, col = pred.Attrition)
## Export predictions alongside the original test data.
## NOTE: write.csv() always uses sep = "," -- passing sep explicitly is
## ignored with a warning, so the argument is dropped here.
data_test_pred <- data.frame(pred.Attrition, data_test)
write.csv(data_test_pred, file = "prediction.csv", quote = FALSE)